### MDP Value Iteration and Policy Iteration
import argparse
import numpy as np
import gym
import time
from lake_envs import *

np.set_printoptions(precision=3)

parser = argparse.ArgumentParser(description='A program to run assignment 1 implementations.',
                                 formatter_class=argparse.ArgumentDefaultsHelpFormatter)

parser.add_argument("--env",
                    help="The name of the environment to run your algorithm on.",
                    choices=["Deterministic-4x4-FrozenLake-v0", "Stochastic-4x4-FrozenLake-v0"],
                    default="Deterministic-4x4-FrozenLake-v0")

"""
For policy_evaluation, policy_improvement, policy_iteration and value_iteration,
the parameters P, nS, nA, gamma are defined as follows:

	P: nested dictionary
		From gym.core.Environment
		For each pair of states in [1, nS] and actions in [1, nA], P[state][action] is a
		tuple of the form (probability, nextstate, reward, terminal) where
			- probability: float
				the probability of transitioning from "state" to "nextstate" with "action"
			- nextstate: int
				denotes the state we transition to (in range [0, nS - 1])
			- reward: int
				either 0 or 1, the reward for transitioning from "state" to
				"nextstate" with "action"
			- terminal: bool
			  True when "nextstate" is a terminal state (hole or goal), False otherwise
	nS: int
		number of states in the environment
	nA: int
		number of actions in the environment
	gamma: float
		Discount factor. Number in range [0, 1)
"""


def policy_evaluation(P, nS, nA, policy, gamma=0.9, tol=1e-3):
    """Evaluate the value function from a given policy.

    Parameters
    ----------
    P, nS, nA, gamma:
        defined at beginning of file
    policy: np.array[nS]
        The policy to evaluate. Maps states to actions.
    tol: float
        Terminate policy evaluation when
            max |value_function(s) - prev_value_function(s)| < tol
    Returns
    -------
    value_function: np.ndarray[nS]
        The value function of the given policy, where value_function[s] is
        the value of state s
    """

    value_function = np.zeros(nS)

    ############################
    # YOUR IMPLEMENTATION HERE #
    V_new = np.zeros(nS)
    V_old = np.ones(nS) * 1_000
    while max(abs(V_new - V_old)) > tol:
        V_old = V_new.copy()
        for state in range(nS):
            action = policy[state]
            model = P[state][action]
            reward = 0
            for i in range(len(model)):
                reward += model[i][0] * (model[i][2] + gamma * V_old[model[i][1]])
            V_new[state] = reward
    value_function = V_new

    ############################
    return value_function


def policy_improvement(P, nS, nA, value_from_policy, policy, gamma=0.9):
    """Given the value function from policy improve the policy.

    Parameters
    ----------
    P, nS, nA, gamma:
        defined at beginning of file
    value_from_policy: np.ndarray
        The value calculated from the policy
    policy: np.array
        The previous policy.

    Returns
    -------
    new_policy: np.ndarray[nS]
        An array of integers. Each integer is the optimal action to take
        in that state according to the environment dynamics and the
        given value function.
    """

    new_policy = np.zeros(nS, dtype='int')

    ############################
    for state in range(nS):
        action_reward_dict = dict()
        for action in range(nA):
            model = P[state][action]
            reward = 0
            for i in range(len(model)):
                reward += model[i][0] * (model[i][2] + gamma * value_from_policy[model[i][1]])
            action_reward_dict[action] = reward
        best_action = max(action_reward_dict, key=action_reward_dict.get)
        new_policy[state] = best_action

    ############################
    return new_policy


def policy_iteration(P, nS, nA, gamma=0.9, tol=10e-3):
    """Runs policy iteration.

    You should call the policy_evaluation() and policy_improvement() methods to
    implement this method.

    Parameters
    ----------
    P, nS, nA, gamma:
        defined at beginning of file
    tol: float
        tol parameter used in policy_evaluation()
    Returns:
    ----------
    value_function: np.ndarray[nS]
    policy: np.ndarray[nS]
    """

    value_function = np.zeros(nS)
    policy = np.zeros(nS, dtype=int)

    ############################
    policy = np.random.randint(0, nA - 1, nS)
    i = 0
    while True:
        i += 1
        value_function = policy_evaluation(P, nS, nA, policy, gamma=gamma, tol=tol)
        new_policy = policy_improvement(P, nS, nA, value_function, policy, gamma=gamma)
        if np.array_equal(policy, new_policy):
            policy = new_policy.copy()
            break
        else:
            policy = new_policy.copy()
    print(f'Policy Iteration converged after {i} iterations')
    ############################
    return value_function, policy


def value_iteration(P, nS, nA, gamma=0.9, tol=1e-3):
    """
    Learn value function and policy by using value iteration method for a given
    gamma and environment.

    Parameters:
    ----------
    P, nS, nA, gamma:
        defined at beginning of file
    tol: float
        Terminate value iteration when
            max |value_function(s) - prev_value_function(s)| < tol
    Returns:
    ----------
    value_function: np.ndarray[nS]
    policy: np.ndarray[nS]
    """

    value_function = np.zeros(nS)
    policy = np.zeros(nS, dtype=int)
    ############################
    V_new = value_function
    V_old = np.ones(nS) * 1_000
    iterations = 0
    while max(abs(V_new - V_old)) > tol:
        iterations += 1
        V_old = V_new.copy()
        for state in range(nS):
            action_reward_dict = dict()
            for action in range(nA):
                model = P[state][action]
                reward = 0
                for i in range(len(model)):
                    reward += model[i][0] * (model[i][2] + gamma * V_old[model[i][1]])
                action_reward_dict[action] = reward
            V_new[state] = max(action_reward_dict.values())

    value_function = V_new
    policy = policy_improvement(P, nS, nA, value_function, policy, gamma=gamma)

    print(f'Value Iteration converged after {iterations} iterations')
    ############################
    return value_function, policy


def render_single(env, policy, max_steps=100):
    """
      This function does not need to be modified
      Renders policy once on environment. Watch your agent play!

      Parameters
      ----------
      env: gym.core.Environment
        Environment to play on. Must have nS, nA, and P as
        attributes.
      Policy: np.array of shape [env.nS]
        The action to take at a given state
    """

    episode_reward = 0
    ob = env.reset()
    for t in range(max_steps):
        env.render()
        time.sleep(0.25)
        a = policy[ob]
        ob, rew, done, _ = env.step(a)
        episode_reward += rew
        if done:
            break
    env.render();
    if not done:
        print("The agent didn't reach a terminal state in {} steps.".format(max_steps))
    else:
        print("Episode reward: %f" % episode_reward)


# Edit below to run policy and value iteration on different environments and
# visualize the resulting policies in action!
# You may change the parameters in the functions below
if __name__ == "__main__":
    # read in script argument
    # 'Deterministic-4x4-FrozenLake-v0', 'Stochastic-4x4-FrozenLake-v0'
    env_id = 'Deterministic-4x4-FrozenLake-v0'

    # Make gym environment
    env = gym.make(env_id)

    print("\n" + "-" * 25 + "\nBeginning Policy Iteration\n" + "-" * 25)

    V_pi, p_pi = policy_iteration(env.P, env.nS, env.nA, gamma=0.9, tol=1e-3)
    render_single(env, p_pi, 100)

    print("\n" + "-" * 25 + "\nBeginning Value Iteration\n" + "-" * 25)

    V_vi, p_vi = value_iteration(env.P, env.nS, env.nA, gamma=0.9, tol=1e-3)
    render_single(env, p_vi, 100)
